#######################################################################
#####                                                            ######
#####   Input: Debates data; Quanteda issue dictionaries         ######
#####   Output: Sentence level scores                            ######
#####                                                            ######
#######################################################################

# Load libraries

library(quanteda) # v3.3.1
library(data.table) # v1.14.8
library(text2vec) # v0.6.3
library(plyr) # v1.8.9

# Load data

load("data/debates.Rdata")
load("working/dictionaries_issues.Rdata")
load("working/word_vectors_150.Rdata")

# Fail early if the loaded files did not provide the objects used further
# down: `debates` (subset with data.table syntax), `dictionary_to_use`
# (dictionary lookup) and `word_scores` (GloVe-based scores).
stopifnot(
  exists("debates"),
  exists("dictionary_to_use"),
  exists("word_scores"),
  is.data.table(debates)
)

# Drop pre-Blair years

debates <- debates[parliamentary_term != "1992-1997"]

## Loop over year-month periods (one iteration per unique `yearmon` value)

# Per-period result tables, stored by position; `i` is the iteration counter.
year_out_list <- list()
i <- 0

# Patterns used inside the loop. They do not depend on the period being
# processed, so they are defined once instead of on every iteration.

# References to numbered parts of legislation ("clause 4", "schedule 2", ...)
# and a few bare keywords. Matched case-insensitively.
# NOTE(review): the alternatives have no word boundaries, so e.g. "act \\d+"
# also matches inside "impact 5" - confirm whether that is intended.
parl_reference_pattern <- "clause \\d+|schedule \\d+|section \\d+|amendment \\d+|petition \\d+|e-petition \\d+|article \\d+|paragraph \\d+|act \\d+|bill \\d+|in paragraph|after paragraph|clause|schedule|amendment"

# Hansard procedural text that is not part of the speeches. Matched
# case-insensitively; the runs of spaces are significant.
# NOTE(review): "I beg to move," is already covered by "I beg to move".
hansard_procedural_pattern <- "question put and agreed to|I beg to move,|Title of Report|TABLE      Day|Time for conclusion of proceedings|I beg to move|The annual amount of any periodical payment to any person by virtue of her being a widow of a past Member of the House of Commons|Ordered,  That, at the sitting|Additional costs allowance     The annual limit on the additional costs allowance|Resolved,  That|TABLE        Allotted    Day|Table      Proceedings|He is wrong to think that the majority     National Lottery|^Ordered,|^Publication Date|^Supplementary London allowance|^The petition states|^Following is the information|^Following is the full text of the petition|Table       |The following Members took and subscribed the Oath"

# Score every sentence against one word-score category.
#
# word_scores_category: one element of `word_scores`; its `word` and
#   `sigmoid` components are used (one weight per word).
# dfm_weighted: tf-idf weighted sentence dfm. Defaults to the global
#   `sentences_dfm_tfidf` so that existing one-argument calls keep working.
#
# Returns one value per sentence: the weighted sum of the tf-idf values of
# the category's words.
calculate_glove_scores <- function(word_scores_category = word_scores$affect,
                                   dfm_weighted = sentences_dfm_tfidf) {
  # Restrict / reorder the features so that they line up with the weights.
  category_dfm <- dfm_match(dfm_weighted, word_scores_category$word)
  (category_dfm %*% word_scores_category$sigmoid)[, 1]
}

yearmon_values <- unique(debates$yearmon)

for (i in seq_along(yearmon_values)) {

  # Indexing (rather than iterating over the values directly) keeps any class
  # the `yearmon` column may carry.
  y <- yearmon_values[i]
  print(y)

  debates_test <- debates[yearmon == y]

  body_text <- debates_test$body

  # Remove all text between square brackets. The character class must exclude
  # "]" so that each bracketed note is removed on its own; excluding ")"
  # instead let one match run from the first "[" to the last "]" and delete
  # the speech text in between.
  body_text <- gsub("\\s*\\[[^]]+\\]", "", body_text)

  # Replace "hon." identifiers - presumably so that the full stop is not
  # treated as a sentence boundary below. `fixed = TRUE` matches the literal
  # full stop; as a regex, "." also matched words such as " honest".
  body_text <- gsub(" hon.", " hon ", body_text, fixed = TRUE)

  debates_test$body <- body_text

  debates_test_corpus <- corpus(debates_test, text_field = "body")
  debates_test_corpus <- corpus_reshape(debates_test_corpus, to = "sentences")

  ## Keep the unmodified sentences for the output table
  raw_texts <- as.character(debates_test_corpus)

  ## Replace parliamentary "numbers" with placeholders so that they are not
  ## picked up as fact-based language.
  ## gsub() is applied to the corpus object itself; the corpus_subset() and
  ## docvars() calls below rely on the result still being a corpus.
  debates_test_corpus <- gsub(parl_reference_pattern, "parlboilerplate",
                              debates_test_corpus, ignore.case = TRUE)

  ## Replace remaining numbers with 000. Numbers with a "," or "." separator
  ## are tried first and the separator is matched literally, so two numbers
  ## separated by some other character are no longer merged into one.
  debates_test_corpus <- gsub("\\d+[.,]\\d+|\\d+", "000", debates_test_corpus)

  ## Remove text that is actually Hansard language, not part of the speeches
  to_keep <- !grepl(hansard_procedural_pattern, debates_test_corpus,
                    ignore.case = TRUE)

  raw_texts <- raw_texts[to_keep]
  debates_test_corpus <- corpus_subset(debates_test_corpus, to_keep)

  debates_test_tokens <- tokens(debates_test_corpus)

  ## Join multi-word entries of the LSD2015 dictionary into single tokens,
  ## i.e. replace "negation word" with "negation_word"
  debates_test_tokens <- tokens_compound(debates_test_tokens,
                                         data_dictionary_LSD2015)

  sentences_dfm <- dfm(debates_test_tokens)

  sentence_length <- ntoken(sentences_dfm)

  ## Calculate dictionary counts
  dfm_liwc <- dfm_lookup(sentences_dfm, dictionary_to_use)

  dict_scores <- as.data.frame(as.matrix(dfm_liwc))
  names(dict_scores) <- paste0("dic_", names(dict_scores))

  ## Calculate Glove scores (tf-idf is computed within the current period)
  sentences_dfm_tfidf <- dfm_tfidf(sentences_dfm)

  glove_scores <- lapply(word_scores, calculate_glove_scores,
                         dfm_weighted = sentences_dfm_tfidf)
  glove_scores <- as.data.frame(glove_scores)
  names(glove_scores) <- paste0("glove_", names(word_scores))

  ## One row per retained sentence
  sentence_docvars <- docvars(sentences_dfm)

  year_out <- data.table(epobject_id = sentence_docvars$epobject_id,
                         section_id = sentence_docvars$section_id,
                         person_id = sentence_docvars$person_id,
                         sent = raw_texts,
                         adjusted_sent = as.character(debates_test_corpus),
                         n_words = sentence_length,
                         glove_scores,
                         dict_scores)

  year_out_list[[i]] <- year_out

}

## Save all sentence level scores in one object

# Stack the per-period tables into a single data.table. `fill = TRUE` matches
# columns by name and pads any column missing from a period with NA.
dictionary_scores_issues <- rbindlist(year_out_list, fill = TRUE)

save(dictionary_scores_issues,
     file = "working/dictionaries_sentence_issues.Rdata")
